library(readxl)
library(readr)
library(dplyr)
library(here)
library(tidyverse)

Read in xlsx file

sheet1 <- read_xlsx(here("raw_data/boing-boing-candy-2015.xlsx"))
sheet1

Inspect the data i.e. dimensions, variable types, variable names, etc.?

dim(sheet1)
colnames(sheet1)
glimpse(sheet1)

Tidy data - long format

sheet1_long <- sheet1 %>%
  pivot_longer(cols = starts_with("["),  #converting rows to columns 
               names_to = "candy",
               values_to = "rating")

#check dimensions, column names
dim(sheet1_long)
colnames(sheet1_long)

#Omitting columns that are not required

sheet1_long <- sheet1_long %>%
  select(-c(4:29))

colnames(sheet1_long)
[1] "Timestamp"                                                "How old are you?"                                         "Are you going actually going trick or treating yourself?"
[4] "candy"                                                    "rating"                                                  
dim(sheet1_long)
[1] 534850      5
colnames(sheet1_long) <- c("timestamp", "age", "going_out", "candy", "rating")
colnames(sheet1_long)
[1] "timestamp" "age"       "going_out" "candy"     "rating"   

#Add column ‘year’

sheet1_long <- sheet1_long %>%
  add_column(year = 2015)
head(sheet1_long)

Read in xlsx file

sheet2 <- read_xlsx(here("raw_data/boing-boing-candy-2016.xlsx"))
sheet2

Inspect the data i.e. dimensions, variable types, variable names, etc.?

dim(sheet2)
colnames(sheet2)
glimpse(sheet2)

Tidy data - long format

sheet2_long <- sheet2 %>%
  pivot_longer(cols = starts_with("["),  #converting rows to columns 
               names_to = "candy",
               values_to = "rating")

dim(sheet2_long)
colnames(sheet2_long)

#Omitting columns that are not required

sheet2_long <- sheet2_long %>%
  select(-c(6:22))

colnames(sheet2_long)
[1] "Timestamp"                                                "Are you going actually going trick or treating yourself?" "Your gender:"                                            
[4] "How old are you?"                                         "Which country do you live in?"                            "candy"                                                   
[7] "rating"                                                  
dim(sheet2_long)
[1] 127159      7
colnames(sheet2_long) <- c("timestamp", "going_out", "gender", "age", "country", "candy", "rating")
colnames(sheet2_long)
[1] "timestamp" "going_out" "gender"    "age"       "country"   "candy"     "rating"   

#Add column ‘year’

sheet2_long <- sheet2_long %>%
  add_column(year = 2016)
head(sheet2_long)

Read in xlsx file

sheet3 <- read_xlsx(here("raw_data/boing-boing-candy-2017.xlsx"))
sheet3

Inspect the data i.e. dimensions, variable types, variable names, etc.?

dim(sheet3)
colnames(sheet3)
glimpse(sheet3)

Tidy data - long format

sheet3_long <- sheet3 %>%
  pivot_longer(cols = starts_with("Q6"),  #converting rows to columns 
               names_to = "candy",
               values_to = "rating")
colnames(sheet3_long)
 [1] "Internal ID"                      "Q1: GOING OUT?"                   "Q2: GENDER"                       "Q3: AGE"                          "Q4: COUNTRY"                     
 [6] "Q5: STATE, PROVINCE, COUNTY, ETC" "Q7: JOY OTHER"                    "Q8: DESPAIR OTHER"                "Q9: OTHER COMMENTS"               "Q10: DRESS"                      
[11] "...114"                           "Q11: DAY"                         "Q12: MEDIA [Daily Dish]"          "Q12: MEDIA [Science]"             "Q12: MEDIA [ESPN]"               
[16] "Q12: MEDIA [Yahoo]"               "Click Coordinates (x, y)"         "candy"                            "rating"                          

#Omitting columns that are not required

sheet3_long <- sheet3_long %>%
  select(-c(6:17))
colnames(sheet3_long)
[1] "Internal ID"    "Q1: GOING OUT?" "Q2: GENDER"     "Q3: AGE"        "Q4: COUNTRY"    "candy"          "rating"        
dim(sheet3_long)
[1] 253380      7

#Change col names

colnames(sheet3_long) <- c("internal_id", "going_out", "gender", "age", "country", "candy", "rating")
colnames(sheet3_long)
[1] "internal_id" "going_out"   "gender"      "age"         "country"     "candy"       "rating"     
sheet3_long
#summarise((count = sum(is.na(rating))))

#Add column ‘year’

sheet3_long <- sheet3_long %>%
  add_column(year = 2017)
head(sheet3_long)

#X X X X X X X X X X X X X X X X X X X X X X X X X X X X X X

Combine all sheets


candy_combined <- bind_rows(sheet1_long, sheet2_long, sheet3_long)

colnames(candy_combined)
[1] "timestamp"   "age"         "going_out"   "candy"       "rating"      "year"        "gender"      "country"     "internal_id"
dim(candy_combined)
[1] 915389      9
head(candy_combined)
write.xlsx(here(candy_combined, file = 'raw_data/candy_combined.xlsx', colNames = TRUE))
Error in saveWorkbook(wb = wb, file = file, overwrite = overwrite) : 
  argument "file" is missing, with no default

#XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX xXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXxx

#Read in combined data file

candy_clean <- read_csv(here("raw_data/candy_combined.csv"))
Parsed with column specification:
cols(
  timestamp = col_datetime(format = ""),
  age = col_double(),
  going_out = col_character(),
  candy = col_character(),
  rating = col_character(),
  year = col_double(),
  gender = col_logical(),
  country = col_logical(),
  internal_id = col_logical()
)
1014402 parsing failures.
 row col               expected actual                                                                             file
5131 age no trailing characters     's '/Users/user/Codeclan_work/Dirty_Data_Project/task4/raw_data/candy_combined.csv'
5132 age no trailing characters     's '/Users/user/Codeclan_work/Dirty_Data_Project/task4/raw_data/candy_combined.csv'
5133 age no trailing characters     's '/Users/user/Codeclan_work/Dirty_Data_Project/task4/raw_data/candy_combined.csv'
5134 age no trailing characters     's '/Users/user/Codeclan_work/Dirty_Data_Project/task4/raw_data/candy_combined.csv'
5135 age no trailing characters     's '/Users/user/Codeclan_work/Dirty_Data_Project/task4/raw_data/candy_combined.csv'
.... ... ...................... ...... ................................................................................
See problems(...) for more details.
head(candy_clean)

#Changing column data type

candy_clean$age <- as.integer(candy_clean$age)
Warning messages:
1: Unknown or uninitialised column: `head`. 
2: Unknown or uninitialised column: `head`. 
candy_clean$year <- as.integer(candy_clean$year)
candy_clean$gender <- as.character(candy_clean$gender)
candy_clean$country <- as.character(candy_clean$country)
candy_clean$internal_id <- as.integer(candy_clean$internal_id)
head(candy_clean)
NA
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkocmVhZHhsKQpsaWJyYXJ5KHJlYWRyKQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KGhlcmUpCmxpYnJhcnkodGlkeXZlcnNlKQpgYGAKCiMgUmVhZCBpbiB4bHN4IGZpbGUKCmBgYHtyfQpzaGVldDEgPC0gcmVhZF94bHN4KGhlcmUoInJhd19kYXRhL2JvaW5nLWJvaW5nLWNhbmR5LTIwMTUueGxzeCIpKQpzaGVldDEKYGBgCgojIEluc3BlY3QgdGhlIGRhdGEgaS5lLiBkaW1lbnNpb25zLCB2YXJpYWJsZSB0eXBlcywgdmFyaWFibGUgbmFtZXMsIGV0Yy4/CgpgYGB7cn0KZGltKHNoZWV0MSkKY29sbmFtZXMoc2hlZXQxKQpnbGltcHNlKHNoZWV0MSkKYGBgCgojIFRpZHkgZGF0YSAtIGxvbmcgZm9ybWF0CgpgYGB7cn0Kc2hlZXQxX2xvbmcgPC0gc2hlZXQxICU+JQogIHBpdm90X2xvbmdlcihjb2xzID0gc3RhcnRzX3dpdGgoIlsiKSwgICNjb252ZXJ0aW5nIHJvd3MgdG8gY29sdW1ucyAKICAgICAgICAgICAgICAgbmFtZXNfdG8gPSAiY2FuZHkiLAogICAgICAgICAgICAgICB2YWx1ZXNfdG8gPSAicmF0aW5nIikKCiNjaGVjayBkaW1lbnNpb25zLCBjb2x1bW4gbmFtZXMKZGltKHNoZWV0MV9sb25nKQpjb2xuYW1lcyhzaGVldDFfbG9uZykKYGBgCgojT21pdHRpbmcgY29sdW1ucyB0aGF0IGFyZSBub3QgcmVxdWlyZWQKCmBgYHtyfQpzaGVldDFfbG9uZyA8LSBzaGVldDFfbG9uZyAlPiUKICBzZWxlY3QoLWMoNDoyOSkpCgpjb2xuYW1lcyhzaGVldDFfbG9uZykKZGltKHNoZWV0MV9sb25nKQpgYGAKCgpgYGB7cn0KY29sbmFtZXMoc2hlZXQxX2xvbmcpIDwtIGMoInRpbWVzdGFtcCIsICJhZ2UiLCAiZ29pbmdfb3V0IiwgImNhbmR5IiwgInJhdGluZyIpCmNvbG5hbWVzKHNoZWV0MV9sb25nKQoKYGBgCgojQWRkIGNvbHVtbiAneWVhcicKCmBgYHtyfQpzaGVldDFfbG9uZyA8LSBzaGVldDFfbG9uZyAlPiUKICBhZGRfY29sdW1uKHllYXIgPSAyMDE1KQpoZWFkKHNoZWV0MV9sb25nKQpgYGAKCgojIFJlYWQgaW4geGxzeCBmaWxlCgpgYGB7cn0Kc2hlZXQyIDwtIHJlYWRfeGxzeChoZXJlKCJyYXdfZGF0YS9ib2luZy1ib2luZy1jYW5keS0yMDE2Lnhsc3giKSkKc2hlZXQyCmBgYAoKIyBJbnNwZWN0IHRoZSBkYXRhIGkuZS4gZGltZW5zaW9ucywgdmFyaWFibGUgdHlwZXMsIHZhcmlhYmxlIG5hbWVzLCBldGMuPwoKYGBge3J9CmRpbShzaGVldDIpCmNvbG5hbWVzKHNoZWV0MikKZ2xpbXBzZShzaGVldDIpCmBgYAoKIyBUaWR5IGRhdGEgLSBsb25nIGZvcm1hdAoKYGBge3J9CnNoZWV0Ml9sb25nIDwtIHNoZWV0MiAlPiUKICBwaXZvdF9sb25nZXIoY29scyA9IHN0YXJ0c193aXRoKCJbIiksICAjY29udmVydGluZyByb3dzIHRvIGNvbHVtbnMgCiAgICAgICAgICAgICAgIG5hbWVzX3RvID0gImNhbmR5IiwKICAgICAgICAgICAgICAgdmFsdWVzX3RvID0gInJhdGluZyIpCgpkaW0oc2hlZXQyX2xvbmcpCmNvbG5hbWVzKHNoZWV0Ml9sb25nKQpgYGAKCiNPbWl0dGluZyBjb2x1bW5zIHRoYXQgYXJlIG5vdCByZXF1aXJlZAoKYGBge3J9CnNoZWV0Ml9sb25nIDwtIHNoZWV0Ml9sb25nICU+JQogIHNlbGVjdCgtYyg2OjIyKSkKCmNvbG5hbWVzKHNoZWV0Ml9sb25nKQpkaW0oc2hlZXQyX2xvbmcpCmBgYAoKCmBgYHtyfQpjb2xuYW1lcyhzaGVldDJfbG9uZykgPC0gYygidGltZXN0YW1wIiwgImdvaW5nX291dCIsICJnZW5kZXIiLCAiYWdlIiwgImNvdW50cnkiLCAiY2FuZHkiLCAicmF0aW5nIikKY29sbmFtZXMoc2hlZXQyX2xvbmcpCmBgYAoKI0FkZCBjb2x1bW4gJ3llYXInCgpgYGB7cn0Kc2hlZXQyX2xvbmcgPC0gc2hlZXQyX2xvbmcgJT4lCiAgYWRkX2NvbHVtbih5ZWFyID0gMjAxNikKaGVhZChzaGVldDJfbG9uZykKYGBgCgoKCiMgUmVhZCBpbiB4bHN4IGZpbGUKCgpgYGB7cn0Kc2hlZXQzIDwtIHJlYWRfeGxzeChoZXJlKCJyYXdfZGF0YS9ib2luZy1ib2luZy1jYW5keS0yMDE3Lnhsc3giKSkKc2hlZXQzCmBgYAoKIyBJbnNwZWN0IHRoZSBkYXRhIGkuZS4gZGltZW5zaW9ucywgdmFyaWFibGUgdHlwZXMsIHZhcmlhYmxlIG5hbWVzLCBldGMuPwoKYGBge3J9CmRpbShzaGVldDMpCmNvbG5hbWVzKHNoZWV0MykKZ2xpbXBzZShzaGVldDMpCmBgYAoKIyBUaWR5IGRhdGEgLSBsb25nIGZvcm1hdAoKYGBge3J9CnNoZWV0M19sb25nIDwtIHNoZWV0MyAlPiUKICBwaXZvdF9sb25nZXIoY29scyA9IHN0YXJ0c193aXRoKCJRNiIpLCAgI2NvbnZlcnRpbmcgcm93cyB0byBjb2x1bW5zIAogICAgICAgICAgICAgICBuYW1lc190byA9ICJjYW5keSIsCiAgICAgICAgICAgICAgIHZhbHVlc190byA9ICJyYXRpbmciKQpjb2xuYW1lcyhzaGVldDNfbG9uZykKYGBgCgojT21pdHRpbmcgY29sdW1ucyB0aGF0IGFyZSBub3QgcmVxdWlyZWQKCmBgYHtyfQpzaGVldDNfbG9uZyA8LSBzaGVldDNfbG9uZyAlPiUKICBzZWxlY3QoLWMoNjoxNykpCmNvbG5hbWVzKHNoZWV0M19sb25nKQpkaW0oc2hlZXQzX2xvbmcpCmBgYAoKCiNDaGFuZ2UgY29sIG5hbWVzCgpgYGB7cn0KY29sbmFtZXMoc2hlZXQzX2xvbmcpIDwtIGMoImludGVybmFsX2lkIiwgImdvaW5nX291dCIsICJnZW5kZXIiLCAiYWdlIiwgImNvdW50cnkiLCAiY2FuZHkiLCAicmF0aW5nIikKY29sbmFtZXMoc2hlZXQzX2xvbmcpCgpzaGVldDNfbG9uZwojc3VtbWFyaXNlKChjb3VudCA9IHN1bShpcy5uYShyYXRpbmcpKSkpCmBgYAoKCiNBZGQgY29sdW1uICd5ZWFyJwoKYGBge3J9CnNoZWV0M19sb25nIDwtIHNoZWV0M19sb25nICU+JQogIGFkZF9jb2x1bW4oeWVhciA9IDIwMTcpCmhlYWQoc2hlZXQzX2xvbmcpCmBgYAoKI1ggWCBYIFggWCBYIFggWCBYIFggWCBYIFggWCBYIFggWCBYIFggWCBYIFggWCBYIFggWCBYIFggWCBYIAoKIyBDb21iaW5lIGFsbCBzaGVldHMgCgpgYGB7cn0KCmNhbmR5X2NvbWJpbmVkIDwtIGJpbmRfcm93cyhzaGVldDFfbG9uZywgc2hlZXQyX2xvbmcsIHNoZWV0M19sb25nKQoKY29sbmFtZXMoY2FuZHlfY29tYmluZWQpCmRpbShjYW5keV9jb21iaW5lZCkKaGVhZChjYW5keV9jb21iaW5lZCkKYGBgCgoKYGBge3J9CmxpYnJhcnkob3Blbnhsc3gpCndyaXRlLnhsc3goaGVyZShjYW5keV9jb21iaW5lZCwgZmlsZSA9ICdyYXdfZGF0YS9jYW5keV9jb21iaW5lZC54bHN4JywgY29sTmFtZXMgPSBUUlVFKSkKYGBgCgoKI1hYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFggICAgICAgICAgICAgICBYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYICAgICAgICAgICAgICAgeFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWHh4CgojUmVhZCBpbiBjb21iaW5lZCBkYXRhIGZpbGUKCmBgYHtyfQpjYW5keV9jbGVhbiA8LSByZWFkX2NzdihoZXJlKCJyYXdfZGF0YS9jYW5keV9jb21iaW5lZC5jc3YiKSkKaGVhZChjYW5keV9jbGVhbikKYGBgCgoKI0NoYW5naW5nIGNvbHVtbiBkYXRhIHR5cGUKCmBgYHtyfQpjYW5keV9jbGVhbiRhZ2UgPC0gYXMuaW50ZWdlcihjYW5keV9jbGVhbiRhZ2UpCmNhbmR5X2NsZWFuJHllYXIgPC0gYXMuaW50ZWdlcihjYW5keV9jbGVhbiR5ZWFyKQpjYW5keV9jbGVhbiRnZW5kZXIgPC0gYXMuY2hhcmFjdGVyKGNhbmR5X2NsZWFuJGdlbmRlcikKY2FuZHlfY2xlYW4kY291bnRyeSA8LSBhcy5jaGFyYWN0ZXIoY2FuZHlfY2xlYW4kY291bnRyeSkKY2FuZHlfY2xlYW4kaW50ZXJuYWxfaWQgPC0gYXMuaW50ZWdlcihjYW5keV9jbGVhbiRpbnRlcm5hbF9pZCkKaGVhZChjYW5keV9jbGVhbikKCmBgYAoKCmBgYHtyfQpkaXN0aW5jdChjYW5keV9jbGVhbiwgImNvdW50cnkiKQpgYGAKCgoKCgoK